This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
Libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(ggplot2)
## Registered S3 methods overwritten by 'ggplot2':
## method from
## [.quosures rlang
## c.quosures rlang
## print.quosures rlang
library(corrplot)
## corrplot 0.84 loaded
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(quantmod)
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
## Loading required package: TTR
## Version 0.4-0 included new data defaults. See ?getSymbols.
library(e1071)
library(useful)
## Registered S3 methods overwritten by 'useful':
## method from
## autoplot.acf forecast
## fortify.ts forecast
##
## Attaching package: 'useful'
## The following object is masked from 'package:xts':
##
## reclass
library(xts)
library(xgboost)
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
library(DiagrammeR)
library(ranger)
##
## Attaching package: 'ranger'
## The following object is masked from 'package:randomForest':
##
## importance
library(vtreat)
## Loading required package: wrapr
##
## Attaching package: 'wrapr'
## The following object is masked from 'package:dplyr':
##
## coalesce
Notes on features by Walmart:
This file contains additional data related to the store, department, and regional activity for the given dates. It contains the following fields:
Store - the store number Date - the week Temperature - average temperature in the region Fuel_Price - cost of fuel in the region MarkDown1-5 - anonymized data related to promotional markdowns that Walmart is running. MarkDown data is only available after Nov 2011, and is not available for all stores all the time. Any missing value is marked with an NA. CPI - the consumer price index Unemployment - the unemployment rate IsHoliday - whether the week is a special holiday week For convenience, the four holidays fall within the following weeks in the dataset (not all holidays are in the data):
Super Bowl: 12-Feb-10, 11-Feb-11, 10-Feb-12, 8-Feb-13 Labor Day: 10-Sep-10, 9-Sep-11, 7-Sep-12, 6-Sep-13 Thanksgiving: 26-Nov-10, 25-Nov-11, 23-Nov-12, 29-Nov-13 Christmas: 31-Dec-10, 30-Dec-11, 28-Dec-12, 27-Dec-13
#loading three data sets "train.csv", "stores.csv", "features.csv" downloaded from Kaggle
salesdata<-read.csv("train.csv", sep = ",")
str(salesdata)
## 'data.frame': 421570 obs. of 5 variables:
## $ Store : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Dept : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Date : Factor w/ 143 levels "2010-02-05","2010-02-12",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Weekly_Sales: num 24924 46039 41596 19404 21828 ...
## $ IsHoliday : logi FALSE TRUE FALSE FALSE FALSE FALSE ...
storesdata<-read.csv("stores.csv", sep = ",")
str(storesdata)
## 'data.frame': 45 obs. of 3 variables:
## $ Store: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Type : Factor w/ 3 levels "A","B","C": 1 1 2 1 2 1 2 1 2 2 ...
## $ Size : int 151315 202307 37392 205863 34875 202505 70713 155078 125833 126512 ...
features<-read.csv("features.csv", sep = ",")
str(features)
## 'data.frame': 8190 obs. of 12 variables:
## $ Store : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Date : Factor w/ 182 levels "2010-02-05","2010-02-12",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Temperature : num 42.3 38.5 39.9 46.6 46.5 ...
## $ Fuel_Price : num 2.57 2.55 2.51 2.56 2.62 ...
## $ MarkDown1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown2 : num NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown3 : num NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown4 : num NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown5 : num NA NA NA NA NA NA NA NA NA NA ...
## $ CPI : num 211 211 211 211 211 ...
## $ Unemployment: num 8.11 8.11 8.11 8.11 8.11 ...
## $ IsHoliday : logi FALSE TRUE FALSE FALSE FALSE FALSE ...
Checking data quality and find no NA in sales and stores data
sum(is.na(salesdata))
## [1] 0
sum(is.na(storesdata))
## [1] 0
Merging three data sets by common elements
mergedata <- merge(salesdata,storesdata, by.salesdata = T)
sum(is.na(mergedata))
## [1] 0
alldata<-merge(mergedata, features, by = intersect(names(salesdata), names(features)))
str(alldata)
## 'data.frame': 421570 obs. of 16 variables:
## $ Store : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Date : Factor w/ 143 levels "2010-02-05","2010-02-12",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ IsHoliday : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Dept : int 1 26 17 45 28 79 55 5 58 7 ...
## $ Weekly_Sales: num 24924.5 11737.1 13223.8 37.4 1085.3 ...
## $ Type : Factor w/ 3 levels "A","B","C": 1 1 1 1 1 1 1 1 1 1 ...
## $ Size : int 151315 151315 151315 151315 151315 151315 151315 151315 151315 151315 ...
## $ Temperature : num 42.3 42.3 42.3 42.3 42.3 ...
## $ Fuel_Price : num 2.57 2.57 2.57 2.57 2.57 ...
## $ MarkDown1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown2 : num NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown3 : num NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown4 : num NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown5 : num NA NA NA NA NA NA NA NA NA NA ...
## $ CPI : num 211 211 211 211 211 ...
## $ Unemployment: num 8.11 8.11 8.11 8.11 8.11 ...
Review attributes and modify data type as needed
#change $Store, $Department variables to factor variables
alldata$Store <- as.factor(alldata$Store)
alldata$Dept <- as.factor(alldata$Dept)
#change $Date variable from factor to date variable
alldata$Date <- as.Date(alldata$Date)
#count the position of $Date(weekly) in a year
alldata$Week <-isoweek (alldata$Date)
str(alldata)
## 'data.frame': 421570 obs. of 17 variables:
## $ Store : Factor w/ 45 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Date : Date, format: "2010-02-05" "2010-02-05" ...
## $ IsHoliday : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Dept : Factor w/ 81 levels "1","2","3","4",..: 1 25 16 44 27 65 53 5 55 7 ...
## $ Weekly_Sales: num 24924.5 11737.1 13223.8 37.4 1085.3 ...
## $ Type : Factor w/ 3 levels "A","B","C": 1 1 1 1 1 1 1 1 1 1 ...
## $ Size : int 151315 151315 151315 151315 151315 151315 151315 151315 151315 151315 ...
## $ Temperature : num 42.3 42.3 42.3 42.3 42.3 ...
## $ Fuel_Price : num 2.57 2.57 2.57 2.57 2.57 ...
## $ MarkDown1 : num NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown2 : num NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown3 : num NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown4 : num NA NA NA NA NA NA NA NA NA NA ...
## $ MarkDown5 : num NA NA NA NA NA NA NA NA NA NA ...
## $ CPI : num 211 211 211 211 211 ...
## $ Unemployment: num 8.11 8.11 8.11 8.11 8.11 ...
## $ Week : num 5 5 5 5 5 5 5 5 5 5 ...
summary(alldata)
## Store Date IsHoliday Dept
## 13 : 10474 Min. :2010-02-05 Mode :logical 1 : 6435
## 10 : 10315 1st Qu.:2010-10-08 FALSE:391909 2 : 6435
## 4 : 10272 Median :2011-06-17 TRUE :29661 3 : 6435
## 1 : 10244 Mean :2011-06-18 4 : 6435
## 2 : 10238 3rd Qu.:2012-02-24 7 : 6435
## 24 : 10228 Max. :2012-10-26 8 : 6435
## (Other):359799 (Other):382960
## Weekly_Sales Type Size Temperature
## Min. : -4989 A:215478 Min. : 34875 Min. : -2.06
## 1st Qu.: 2080 B:163495 1st Qu.: 93638 1st Qu.: 46.68
## Median : 7612 C: 42597 Median :140167 Median : 62.09
## Mean : 15981 Mean :136728 Mean : 60.09
## 3rd Qu.: 20206 3rd Qu.:202505 3rd Qu.: 74.28
## Max. :693099 Max. :219622 Max. :100.14
##
## Fuel_Price MarkDown1 MarkDown2 MarkDown3
## Min. :2.472 Min. : 0.27 Min. : -265.8 Min. : -29.10
## 1st Qu.:2.933 1st Qu.: 2240.27 1st Qu.: 41.6 1st Qu.: 5.08
## Median :3.452 Median : 5347.45 Median : 192.0 Median : 24.60
## Mean :3.361 Mean : 7246.42 Mean : 3334.6 Mean : 1439.42
## 3rd Qu.:3.738 3rd Qu.: 9210.90 3rd Qu.: 1926.9 3rd Qu.: 103.99
## Max. :4.468 Max. :88646.76 Max. :104519.5 Max. :141630.61
## NA's :270889 NA's :310322 NA's :284479
## MarkDown4 MarkDown5 CPI Unemployment
## Min. : 0.22 Min. : 135.2 Min. :126.1 Min. : 3.879
## 1st Qu.: 504.22 1st Qu.: 1878.4 1st Qu.:132.0 1st Qu.: 6.891
## Median : 1481.31 Median : 3359.4 Median :182.3 Median : 7.866
## Mean : 3383.17 Mean : 4629.0 Mean :171.2 Mean : 7.960
## 3rd Qu.: 3595.04 3rd Qu.: 5563.8 3rd Qu.:212.4 3rd Qu.: 8.572
## Max. :67474.85 Max. :108519.3 Max. :227.2 Max. :14.313
## NA's :286603 NA's :270138
## Week
## Min. : 1.00
## 1st Qu.:14.00
## Median :26.00
## Mean :25.83
## 3rd Qu.:38.00
## Max. :52.00
##
Counting NAs
#count numbers of NAs
apply(alldata, MARGIN = 2, function (x) sum(is.na(x)))
## Store Date IsHoliday Dept Weekly_Sales
## 0 0 0 0 0
## Type Size Temperature Fuel_Price MarkDown1
## 0 0 0 0 270889
## MarkDown2 MarkDown3 MarkDown4 MarkDown5 CPI
## 310322 284479 286603 270138 0
## Unemployment Week
## 0 0
#calculate percentages of NAs
apply(alldata, MARGIN = 2, function (x) sum(is.na(x))/length(x))
## Store Date IsHoliday Dept Weekly_Sales
## 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
## Type Size Temperature Fuel_Price MarkDown1
## 0.0000000 0.0000000 0.0000000 0.0000000 0.6425718
## MarkDown2 MarkDown3 MarkDown4 MarkDown5 CPI
## 0.7361103 0.6748085 0.6798468 0.6407904 0.0000000
## Unemployment Week
## 0.0000000 0.0000000
#Results: five "Markdown" variables have above 65% of NAs in their data sets respectively
Check negative numbers
table(sign(alldata$Weekly_Sales))
##
## -1 0 1
## 1285 73 420212
table(sign(alldata$MarkDown1))
##
## 1
## 150681
table(sign(alldata$MarkDown2))
##
## -1 0 1
## 1311 207 109730
table(sign(alldata$MarkDown3))
##
## -1 0 1
## 257 67 136767
table(sign(alldata$MarkDown4))
##
## 1
## 134967
table(sign(alldata$MarkDown5))
##
## 1
## 151432
Exploring data characteristics
#log Weekly Department-Store Sales histogram
#log per department per store weekly sales appear mount-shaped and left skewed
ggplot(alldata) + geom_histogram (aes(x= Weekly_Sales), bins = 100) + scale_x_continuous(trans = "log10")
## Warning in self$trans$transform(x): NaNs produced
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 1358 rows containing non-finite values (stat_bin).
#aggregate department (total 99 department ID) "Weekly Sales" variable into store (total 45 stores) weekly sales , log histogram
#store weekly sales distribution is not normally distributed, need to separate data by "Type" variable
alldata %>% dplyr::select (Weekly_Sales, Store, Date) %>% group_by(Store, Date) %>% summarise (WKlyStoreSales = sum(Weekly_Sales))%>%
ggplot(aes(x=WKlyStoreSales)) + geom_histogram() + scale_x_continuous(trans = "log10")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#store weekly sales grouped by Type, log histogram
#store type data shows mount shaped
alldata %>% dplyr::select (Weekly_Sales, Store, Date, Type) %>% group_by(Store, Date, Type) %>% summarise (WKlyStoreSales = sum(Weekly_Sales))%>%
ggplot(aes(x=WKlyStoreSales)) + geom_histogram() + facet_grid(Type~.) + scale_x_continuous(trans = "log10")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#divide store weekly sale data by size and graph histogram
alldata %>% mutate (WKlySizeSales = Weekly_Sales/Size) %>%
group_by (Date, Store) %>%
summarize (PerStoreSizeSales = sum(WKlySizeSales)) %>%
ggplot(aes(x=PerStoreSizeSales), binwidth = 10) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#what sizes of store belong to which store type and count the numbers
alldata %>% dplyr::select (Store, Size, Type) %>% distinct () %>%
ggplot( aes(x = Size)) + geom_histogram() + facet_grid(Type~.)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Seasonality by store types: sales spike for Thanksgiving and Christmas holidays
#plot store weekly sales data by store type (A,B,C) against 143 week-time period
#strong seasonality
alldata %>% group_by (Type, Date) %>% summarize (WeeklySales = sum(Weekly_Sales)) %>%
ggplot(aes (x = Date, y = WeeklySales)) + geom_line(aes(color = factor(Type), group = Type)) +scale_y_log10()+scale_x_date(date_breaks = "4 weeks")+ theme(axis.text.x = element_text(angle = 90, hjust = 1))
#plot store sales per size against 143-week time period to check if seasonality stays
alldata %>% group_by (Date) %>% summarize (Mean_WeeklySales = mean(Weekly_Sales)) %>%
ggplot(aes (x = Date, y = Mean_WeeklySales)) + geom_line()+geom_smooth()+ scale_x_date(date_breaks = "4 weeks")+ theme(axis.text.x = element_text(angle = 90, hjust = 1))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Checking IsHoliday against holiday calendars
#IsHoliday variable appears at the same week in a year
ggplot(alldata, aes(x = IsHoliday, y = Week)) + geom_point()
alldata %>%dplyr:: select (Date, IsHoliday, Week) %>% filter (IsHoliday == "TRUE") %>% distinct()
## Date IsHoliday Week
## 1 2010-02-12 TRUE 6
## 2 2010-09-10 TRUE 36
## 3 2010-11-26 TRUE 47
## 4 2010-12-31 TRUE 52
## 5 2011-02-11 TRUE 6
## 6 2011-09-09 TRUE 36
## 7 2011-11-25 TRUE 47
## 8 2011-12-30 TRUE 52
## 9 2012-02-10 TRUE 6
## 10 2012-09-07 TRUE 36
weekly sales correlations with CPI, Fuel Price, Temprature, Unemployment
#corrplot shows no correlations
attach(alldata)
M<-data.frame(Weekly_Sales,Fuel_Price, Temperature, CPI, Unemployment)
MCor<- cor(M)
corrplot( MCor, order = "AOE")
Markdown feature
#Filter dataset with MarkDowns
MKD<-alldata %>% filter(Date >= "2012-02-03")
MKD2<-alldata %>% filter(Date <"2012-02-03")
Check NAs in MarkDowns after 2012-02-03 (temporal split for test set)
#count NA numbers
apply(MKD, MARGIN = 2, function (x) sum(is.na(x)))
## Store Date IsHoliday Dept Weekly_Sales
## 0 0 0 0 0
## Type Size Temperature Fuel_Price MarkDown1
## 0 0 0 0 457
## MarkDown2 MarkDown3 MarkDown4 MarkDown5 CPI
## 36292 11813 12050 0 0
## Unemployment Week
## 0 0
#count NA percentages
apply(MKD, MARGIN = 2, function (x) sum(is.na(x))/length(x))
## Store Date IsHoliday Dept Weekly_Sales
## 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000
## Type Size Temperature Fuel_Price MarkDown1
## 0.000000000 0.000000000 0.000000000 0.000000000 0.003953698
## MarkDown2 MarkDown3 MarkDown4 MarkDown5 CPI
## 0.313977229 0.102199190 0.104249576 0.000000000 0.000000000
## Unemployment Week
## 0.000000000 0.000000000
Check NAs in MarkDowns before 2012-02-03
#Markdown features have more than 88% NAs before 2012-02-03
apply(MKD2, MARGIN = 2, function (x) sum(is.na(x)))
## Store Date IsHoliday Dept Weekly_Sales
## 0 0 0 0 0
## Type Size Temperature Fuel_Price MarkDown1
## 0 0 0 0 270432
## MarkDown2 MarkDown3 MarkDown4 MarkDown5 CPI
## 274030 272666 274553 270138 0
## Unemployment Week
## 0 0
apply(MKD2, MARGIN = 2, function (x) sum(is.na(x))/length(x))
## Store Date IsHoliday Dept Weekly_Sales
## 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
## Type Size Temperature Fuel_Price MarkDown1
## 0.0000000 0.0000000 0.0000000 0.0000000 0.8838167
## MarkDown2 MarkDown3 MarkDown4 MarkDown5 CPI
## 0.8955756 0.8911178 0.8972848 0.8828559 0.0000000
## Unemployment Week
## 0.0000000 0.0000000
Boxplot charts of MarkDown data after 2012-02-03 (because of less missing values for this period)
#subset MarkDown features
MKDdata<-cbind(MKD$MarkDown1, MKD$MarkDown2, MKD$MarkDown3, MKD$MarkDown4, MKD$MarkDown5)
#boxplot
boxplot(MKDdata)
#scatterplot department markdown data over 143-week period
MKD %>% ggplot(aes(x = Date, y = MarkDown1))+ geom_point()+scale_x_date(date_breaks = "4 weeks")+ theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Warning: Removed 457 rows containing missing values (geom_point).
MKD %>% ggplot(aes(x = Date, y = MarkDown2))+ geom_point()+scale_x_date(date_breaks = "4 weeks")+ theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Warning: Removed 36292 rows containing missing values (geom_point).
MKD %>% ggplot(aes(x = Date, y = MarkDown3))+ geom_point()+scale_x_date(date_breaks = "4 weeks")+ theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Warning: Removed 11813 rows containing missing values (geom_point).
MKD %>% ggplot(aes(x = Date, y = MarkDown4))+ geom_point()+scale_x_date(date_breaks = "4 weeks")+ theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Warning: Removed 12050 rows containing missing values (geom_point).
MKD %>% ggplot(aes(x = Date, y = MarkDown5))+ geom_point()+scale_x_date(date_breaks = "4 weeks")+ theme(axis.text.x = element_text(angle = 90, hjust = 1))
Correlation between MarkDowns and Weekly Sales
SM<-alldata %>% dplyr::select (Weekly_Sales, MarkDown1, MarkDown2, MarkDown3, MarkDown4, MarkDown5) %>%
filter (Date >= "2012-02-03")
ggplot(SM, aes(x=MarkDown1, y = Weekly_Sales)) + geom_point()
## Warning: Removed 457 rows containing missing values (geom_point).
ggplot(SM, aes(x = MarkDown2, y = Weekly_Sales)) + geom_point()
## Warning: Removed 36292 rows containing missing values (geom_point).
ggplot(SM, aes(x = MarkDown3, y = Weekly_Sales)) + geom_point()
## Warning: Removed 11813 rows containing missing values (geom_point).
ggplot(SM, aes(x= MarkDown4, y = Weekly_Sales))+geom_point()
## Warning: Removed 12050 rows containing missing values (geom_point).
ggplot(SM, aes(x=MarkDown5, y = Weekly_Sales)) + geom_point()
Drop MarkDown features and external features because of large NAs and no strong correlation with Weekly Sales
#drop features
data_clean<- alldata %>% dplyr::select(-MarkDown1, -MarkDown2, -MarkDown3, -MarkDown4, -MarkDown5, -CPI, -Temperature, -Fuel_Price, -Unemployment)
#keep features: weekly sales(numeric), store (factor), dept (factor), Date, IsHoliday(logic), Week(numeric), Type(factor), Size (integer)
str(data_clean)
## 'data.frame': 421570 obs. of 8 variables:
## $ Store : Factor w/ 45 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Date : Date, format: "2010-02-05" "2010-02-05" ...
## $ IsHoliday : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Dept : Factor w/ 81 levels "1","2","3","4",..: 1 25 16 44 27 65 53 5 55 7 ...
## $ Weekly_Sales: num 24924.5 11737.1 13223.8 37.4 1085.3 ...
## $ Type : Factor w/ 3 levels "A","B","C": 1 1 1 1 1 1 1 1 1 1 ...
## $ Size : int 151315 151315 151315 151315 151315 151315 151315 151315 151315 151315 ...
## $ Week : num 5 5 5 5 5 5 5 5 5 5 ...
Stepwise feature selection
#feature selection
#nullModel<-lm(Weekly_Sales~1, data_clean)
#fullModel<-lm(Weekly_Sales~Dept+Store+IsHoliday+Week+Size+Type, data = data_clean)
#SalesStep<-step(nullModel, scope = (list(lower=nullModel, upper = fullModel)), direction = "both")
Prepare data set for model building
#checking department-store level
# 3331 department-store levels
#data_clean$DeptID<-as.factor(paste("D", data_ts$Dept, "S", data_ts$Store, sep=""))
#levels(data_clean[,"DeptID"])
#drop DeptID variable due to computation inefficiency
#data_clean<-subset(data_clean, select = -c(DeptID))
str(data_clean)
## 'data.frame': 421570 obs. of 8 variables:
## $ Store : Factor w/ 45 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Date : Date, format: "2010-02-05" "2010-02-05" ...
## $ IsHoliday : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Dept : Factor w/ 81 levels "1","2","3","4",..: 1 25 16 44 27 65 53 5 55 7 ...
## $ Weekly_Sales: num 24924.5 11737.1 13223.8 37.4 1085.3 ...
## $ Type : Factor w/ 3 levels "A","B","C": 1 1 1 1 1 1 1 1 1 1 ...
## $ Size : int 151315 151315 151315 151315 151315 151315 151315 151315 151315 151315 ...
## $ Week : num 5 5 5 5 5 5 5 5 5 5 ...
#aggregate sales to store sales
#data size: 45 store weekly sales data for 143 weeks
data_agg_store<-data_clean %>% select(Weekly_Sales, Date, Store) %>% group_by(Date, Store) %>% summarise(Store_Weekly_Sales = sum(Weekly_Sales))
summary(data_agg_store)
## Date Store Store_Weekly_Sales
## Min. :2010-02-05 1 : 143 Min. : 209986
## 1st Qu.:2010-10-08 2 : 143 1st Qu.: 553350
## Median :2011-06-17 3 : 143 Median : 960746
## Mean :2011-06-17 4 : 143 Mean :1046965
## 3rd Qu.:2012-02-24 5 : 143 3rd Qu.:1420159
## Max. :2012-10-26 6 : 143 Max. :3818686
## (Other):5577
Split train/test sets by temporal point “2012-02-03”, about 73/27 split by week numbers
train<- data_agg_store%>% filter (Date < "2012-02-03")
test<-data_agg_store %>% filter (Date >= "2012-02-03")
summary(train)
## Date Store Store_Weekly_Sales
## Min. :2010-02-05 1 : 104 Min. : 209986
## 1st Qu.:2010-08-04 2 : 104 1st Qu.: 549523
## Median :2011-01-31 3 : 104 Median : 956062
## Mean :2011-01-31 4 : 104 Mean :1048286
## 3rd Qu.:2011-07-30 5 : 104 3rd Qu.:1415272
## Max. :2012-01-27 6 : 104 Max. :3818686
## (Other):4056
summary(test)
## Date Store Store_Weekly_Sales
## Min. :2012-02-03 1 : 39 Min. : 237130
## 1st Qu.:2012-04-06 2 : 39 1st Qu.: 561700
## Median :2012-06-15 3 : 39 Median : 968897
## Mean :2012-06-15 4 : 39 Mean :1043441
## 3rd Qu.:2012-08-24 5 : 39 3rd Qu.:1439366
## Max. :2012-10-26 6 : 39 Max. :2565260
## (Other):1521
#create xts series
#create time index
train_Date_index<-seq(as.Date("2010-02-05"), length.out = 104, by = "week")
test_Date_index<-seq(as.Date("2012-02-03"), length.out = 39, by = "week")
train_Date_index
## [1] "2010-02-05" "2010-02-12" "2010-02-19" "2010-02-26" "2010-03-05"
## [6] "2010-03-12" "2010-03-19" "2010-03-26" "2010-04-02" "2010-04-09"
## [11] "2010-04-16" "2010-04-23" "2010-04-30" "2010-05-07" "2010-05-14"
## [16] "2010-05-21" "2010-05-28" "2010-06-04" "2010-06-11" "2010-06-18"
## [21] "2010-06-25" "2010-07-02" "2010-07-09" "2010-07-16" "2010-07-23"
## [26] "2010-07-30" "2010-08-06" "2010-08-13" "2010-08-20" "2010-08-27"
## [31] "2010-09-03" "2010-09-10" "2010-09-17" "2010-09-24" "2010-10-01"
## [36] "2010-10-08" "2010-10-15" "2010-10-22" "2010-10-29" "2010-11-05"
## [41] "2010-11-12" "2010-11-19" "2010-11-26" "2010-12-03" "2010-12-10"
## [46] "2010-12-17" "2010-12-24" "2010-12-31" "2011-01-07" "2011-01-14"
## [51] "2011-01-21" "2011-01-28" "2011-02-04" "2011-02-11" "2011-02-18"
## [56] "2011-02-25" "2011-03-04" "2011-03-11" "2011-03-18" "2011-03-25"
## [61] "2011-04-01" "2011-04-08" "2011-04-15" "2011-04-22" "2011-04-29"
## [66] "2011-05-06" "2011-05-13" "2011-05-20" "2011-05-27" "2011-06-03"
## [71] "2011-06-10" "2011-06-17" "2011-06-24" "2011-07-01" "2011-07-08"
## [76] "2011-07-15" "2011-07-22" "2011-07-29" "2011-08-05" "2011-08-12"
## [81] "2011-08-19" "2011-08-26" "2011-09-02" "2011-09-09" "2011-09-16"
## [86] "2011-09-23" "2011-09-30" "2011-10-07" "2011-10-14" "2011-10-21"
## [91] "2011-10-28" "2011-11-04" "2011-11-11" "2011-11-18" "2011-11-25"
## [96] "2011-12-02" "2011-12-09" "2011-12-16" "2011-12-23" "2011-12-30"
## [101] "2012-01-06" "2012-01-13" "2012-01-20" "2012-01-27"
test_Date_index
## [1] "2012-02-03" "2012-02-10" "2012-02-17" "2012-02-24" "2012-03-02"
## [6] "2012-03-09" "2012-03-16" "2012-03-23" "2012-03-30" "2012-04-06"
## [11] "2012-04-13" "2012-04-20" "2012-04-27" "2012-05-04" "2012-05-11"
## [16] "2012-05-18" "2012-05-25" "2012-06-01" "2012-06-08" "2012-06-15"
## [21] "2012-06-22" "2012-06-29" "2012-07-06" "2012-07-13" "2012-07-20"
## [26] "2012-07-27" "2012-08-03" "2012-08-10" "2012-08-17" "2012-08-24"
## [31] "2012-08-31" "2012-09-07" "2012-09-14" "2012-09-21" "2012-09-28"
## [36] "2012-10-05" "2012-10-12" "2012-10-19" "2012-10-26"
#create wide form train data
trainwide<-dcast(train,Date~Store, fun.aggregate = sum, value.var = "Store_Weekly_Sales") %>% select(-"Date")
#xts train data
train_xts<-xts(trainwide,order.by = train_Date_index)
head(train_xts)
## 1 2 3 4 5 6 7
## 2010-02-05 1643691 2136989 461622.2 2135144 317173.1 1652635 496725.4
## 2010-02-12 1641957 2137810 420729.0 2188307 311825.7 1606284 524104.9
## 2010-02-19 1611968 2124452 421642.2 2049860 303447.6 1567138 506760.5
## 2010-02-26 1409728 1865097 407204.9 1925729 270281.6 1432953 496083.2
## 2010-03-05 1554807 1991013 415202.0 1971057 288855.7 1601349 491419.5
## 2010-03-12 1439542 1990484 384200.7 1894324 297293.6 1558621 480452.1
## 8 9 10 11 12 13 14
## 2010-02-05 1004137.1 549505.6 2193049 1528009 1100046.4 1967221 2623470
## 2010-02-12 994801.4 552677.5 2176029 1574684 1117863.3 2030933 1704219
## 2010-02-19 963960.4 511327.9 2113433 1503299 1095421.6 1970275 2204557
## 2010-02-26 847592.1 473773.3 2006775 1336405 1048617.2 1817850 2095592
## 2010-03-05 881503.9 507297.9 1987090 1426623 1077018.3 1939980 2237545
## 2010-03-12 860336.2 494145.8 1941346 1331883 985594.2 1840687 2156035
## 15 16 17 18 19 20 21
## 2010-02-05 652122.4 477409.3 789036.0 1205308 1507637 2401395 798593.9
## 2010-02-12 682447.1 472044.3 841951.9 1187881 1536550 2109108 809321.4
## 2010-02-19 660838.8 469868.7 800714.0 1150663 1515976 2161550 867283.2
## 2010-02-26 564883.2 443242.2 749549.6 1068157 1373270 1898194 749597.2
## 2010-03-05 605325.4 444181.8 783300.1 1179738 1495845 2119214 747444.3
## 2010-03-12 604173.6 445393.7 763961.8 1138800 1467889 2010975 712312.9
## 22 23 24 25 26 27 28
## 2010-02-05 1033017.4 1364722 1388726 677231.6 1034119.2 1874290 1672352
## 2010-02-12 1022571.2 1380892 1414107 583364.0 1015684.1 1745363 1558968
## 2010-02-19 988467.6 1319588 1385362 676260.7 999348.6 1945070 1491300
## 2010-02-26 899761.5 1198710 1158723 628516.6 855385.0 1390934 1542173
## 2010-03-05 1009201.2 1311176 1412387 665750.1 1005669.6 1313730 1608435
## 2010-03-12 967187.4 1408083 1309340 660620.0 963382.1 1925113 1326877
## 29 30 31 32 33 34 35
## 2010-02-05 538634.5 465108.5 1469252 1087616 274593.4 956229.0 1230614
## 2010-02-12 529672.9 497374.6 1543947 1123566 294882.8 994611.0 1168815
## 2010-02-19 542399.1 463513.3 1473387 1082559 296850.8 983963.1 1270659
## 2010-02-26 488417.6 472330.7 1344354 1053247 284052.8 905756.1 1020652
## 2010-03-05 535087.9 472591.1 1384871 1066567 291484.9 918295.8 1162610
## 2010-03-12 519042.5 468189.9 1366193 1093319 312161.0 921247.9 1150344
## 36 37 38 39 40 41 42
## 2010-02-05 467546.7 536006.7 358496.1 1230597 1001943.8 1086533.2 543384.0
## 2010-02-12 469563.7 529852.7 342214.9 1266229 955338.3 1075656.3 575710.0
## 2010-02-19 470281.0 510382.5 327237.9 1230592 916289.2 1052034.7 508794.9
## 2010-02-26 447519.4 513615.8 334222.7 1168582 863917.4 991941.7 491510.6
## 2010-03-05 480203.4 519255.7 372239.9 1266254 990152.3 1063557.5 554972.4
## 2010-03-12 441434.2 513015.3 342023.9 1244392 899352.4 1023997.7 588363.6
## 43 44 45
## 2010-02-05 647029.3 281091.0 890689.5
## 2010-02-12 682919.0 286857.1 656988.6
## 2010-02-19 658997.6 267956.3 841264.0
## 2010-02-26 618702.8 273079.1 741891.7
## 2010-03-05 658600.1 284617.3 777951.2
## 2010-03-12 645386.9 272190.8 765687.4
#create wide form test data
testwide<-dcast(test, Date~Store, fun.aggregate = sum, value.var = "Store_Weekly_Sales") %>%select(-"Date")
#create xts test data
test_xts<-xts(testwide,order.by = test_Date_index)
Build seasonal auto.arima model
#checking ACF and PACF
lapply(train_xts, function(x) acf(x))
## $`1`
##
## Autocorrelations of series 'x', by lag
##
## 0 7 14 21 28 35 42 49 56 63
## 1.000 0.299 0.164 0.082 0.199 -0.189 -0.114 -0.029 -0.072 -0.077
## 70 77 84 91 98 105 112 119 126 133
## -0.013 -0.021 -0.029 0.003 -0.004 -0.017 0.030 0.014 -0.004 -0.041
## 140
## -0.018
##
## $`2`
##
## Autocorrelations of series 'x', by lag
##
## 0 7 14 21 28 35 42 49 56 63
## 1.000 0.393 0.259 0.105 0.191 -0.145 -0.114 -0.024 -0.053 -0.100
## 70 77 84 91 98 105 112 119 126 133
## -0.061 -0.037 -0.059 -0.097 -0.076 -0.053 0.000 -0.020 -0.027 -0.061
## 140
## -0.057
##
## $`3`
##
## Autocorrelations of series 'x', by lag
##
## 0 7 14 21 28 35 42 49 56 63
## 1.000 0.522 0.377 0.381 0.305 -0.016 -0.004 0.064 -0.034 -0.009
## 70 77 84 91 98 105 112 119 126 133
## 0.008 0.012 -0.038 -0.005 -0.073 -0.132 -0.096 -0.095 -0.140 -0.134
## 140
## -0.115
##
## $`4`
##
## Autocorrelations of series 'x', by lag
##
## 0 7 14 21 28 35 42 49 56 63
## 1.000 0.415 0.291 0.210 0.345 0.029 0.041 0.096 0.077 0.063
## 70 77 84 91 98 105 112 119 126 133
## 0.047 0.065 0.025 0.014 0.028 0.040 0.024 0.035 0.037 0.022
## 140
## -0.029
##
## $`5`
##
## Autocorrelations of series 'x', by lag
##
## 0 7 14 21 28 35 42 49 56 63
## 1.000 0.367 0.277 0.228 0.287 -0.084 -0.119 -0.085 -0.080 -0.062
## 70 77 84 91 98 105 112 119 126 133
## -0.077 -0.015 0.017 -0.029 -0.055 -0.047 0.003 -0.065 -0.065 -0.108
## 140
## -0.071
##
## $`6`
##
## Autocorrelations of series 'x', by lag
##
## 0 7 14 21 28 35 42 49 56 63
## 1.000 0.455 0.281 0.139 0.166 -0.169 -0.183 -0.183 -0.218 -0.235
## 70 77 84 91 98 105 112 119 126 133
## -0.198 -0.151 -0.135 -0.164 -0.133 -0.060 -0.003 -0.013 0.009 0.002
## 140
## 0.033
##
## $`7`
##
## Autocorrelations of series 'x', by lag
##
## 0 7 14 21 28 35 42 49 56 63
## 1.000 0.638 0.514 0.368 0.391 0.170 0.053 -0.048 -0.130 -0.146
## 70 77 84 91 98 105 112 119 126 133
## -0.165 -0.143 -0.158 -0.134 -0.118 -0.056 -0.017 -0.001 0.037 0.064
## 140
## 0.138
##
## $`8`
##
## Autocorrelations of series 'x', by lag
##
## 0 7 14 21 28 35 42 49 56 63
## 1.000 0.367 0.251 0.133 0.189 -0.191 -0.086 -0.017 0.001 -0.023
## 70 77 84 91 98 105 112 119 126 133
## -0.009 -0.018 -0.050 -0.069 -0.059 -0.025 0.035 0.011 -0.046 -0.050
## 140
## -0.095
##
## $`9`
##
## Autocorrelations of series 'x', by lag
##
## 0 7 14 21 28 35 42 49 56 63
## 1.000 0.428 0.297 0.206 0.253 -0.113 -0.054 -0.019 0.000 -0.057
## 70 77 84 91 98 105 112 119 126 133
## -0.042 -0.013 -0.017 -0.077 -0.060 -0.019 0.024 -0.011 -0.026 -0.032
## 140
## -0.028
##
## $`10`
##
## Autocorrelations of series 'x', by lag
##
## 0 7 14 21 28 35 42 49 56 63
## 1.000 0.456 0.310 0.181 0.212 -0.125 -0.095 -0.063 -0.093 -0.101
## 70 77 84 91 98 105 112 119 126 133
## -0.074 -0.055 -0.085 -0.101 -0.056 -0.038 -0.028 -0.005 0.040 0.007
## 140
## -0.076
##
## $`11`
##
## Autocorrelations of series 'x', by lag
##
## 0 7 14 21 28 35 42 49 56 63
## 1.000 0.407 0.223 0.079 0.118 -0.233 -0.134 -0.031 -0.056 -0.081
## 70 77 84 91 98 105 112 119 126 133
## -0.041 -0.039 -0.081 -0.101 -0.060 -0.041 0.029 0.058 0.106 0.020
## 140
## -0.041
##
## $`12`
##
## Autocorrelations of series 'x', by lag
##
## 0 7 14 21 28 35 42 49 56 63
## 1.000 0.364 0.243 0.117 0.199 -0.198 -0.120 -0.102 -0.101 -0.118
## 70 77 84 91 98 105 112 119 126 133
## -0.046 -0.053 -0.002 -0.049 -0.039 -0.021 0.081 0.004 -0.037 -0.079
## 140
## -0.101
##
## $`13`
##
## Autocorrelations of series 'x', by lag
##
## 0 7 14 21 28 35 42 49 56 63
## 1.000 0.399 0.262 0.090 0.189 -0.166 -0.103 -0.083 -0.073 -0.110
## 70 77 84 91 98 105 112 119 126 133
## -0.061 -0.079 -0.115 -0.125 -0.079 -0.070 0.013 0.047 0.058 0.012
## 140
## 0.020
##
## $`14`
##
## Autocorrelations of series 'x', by lag
##
## 0 7 14 21 28 35 42 49 56 63
## 1.000 0.264 0.262 0.042 0.143 -0.236 -0.102 -0.096 -0.067 -0.098
## 70 77 84 91 98 105 112 119 126 133
## -0.059 -0.014 -0.135 -0.036 -0.062 0.029 -0.034 0.061 -0.015 -0.004
## 140
## 0.003
##
## $`15`
##
## Autocorrelations of series 'x', by lag
##
## 0 7 14 21 28 35 42 49 56 63
## 1.000 0.408 0.234 0.113 0.157 -0.225 -0.194 -0.166 -0.132 -0.158
## 70 77 84 91 98 105 112 119 126 133
## -0.134 -0.111 -0.108 -0.132 -0.095 -0.038 -0.016 -0.020 -0.058 -0.029
## 140
## 0.003
##
## $`16`
##
## Autocorrelations of series 'x', by lag
##
## 0 7 14 21 28 35 42 49 56 63
## 1.000 0.597 0.430 0.259 0.269 0.050 -0.008 -0.068 -0.065 -0.067
## 70 77 84 91 98 105 112 119 126 133
## -0.129 -0.166 -0.163 -0.137 -0.122 -0.098 -0.079 -0.122 -0.143 -0.091
## 140
## -0.031
##
## $`17`
##
## Autocorrelations of series 'x', by lag
##
## 0 7 14 21 28 35 42 49 56 63
## 1.000 0.148 0.321 0.161 0.156 -0.118 0.106 -0.003 0.026 -0.028
## 70 77 84 91 98 105 112 119 126 133
## 0.095 0.095 0.004 0.022 0.062 0.143 -0.180 0.092 -0.099 -0.105
## 140
## 0.033
##
## $`18`
##
## Autocorrelations of series 'x', by lag
##
## 0 7 14 21 28 35 42 49 56 63
## 1.000 0.435 0.337 0.161 0.189 -0.156 -0.068 -0.026 -0.028 -0.067
## 70 77 84 91 98 105 112 119 126 133
## -0.048 -0.050 -0.165 -0.094 -0.120 -0.053 -0.027 0.081 0.003 0.003
## 140
## 0.010
##
## $`19`
##
## Autocorrelations of series 'x', by lag
##
## 0 7 14 21 28 35 42 49 56 63
## 1.000 0.353 0.180 -0.004 0.066 -0.215 -0.142 -0.110 -0.078 -0.097
## 70 77 84 91 98 105 112 119 126 133
## -0.065 -0.008 -0.048 -0.060 -0.018 0.076 0.066 0.025 -0.089 -0.081
## 140
## -0.045
##
## $`20`
##
## Autocorrelations of series 'x', by lag
##
## 0 7 14 21 28 35 42 49 56 63
## 1.000 0.396 0.279 0.126 0.155 -0.161 -0.055 -0.043 -0.048 -0.079
## 70 77 84 91 98 105 112 119 126 133
## -0.033 -0.053 -0.099 -0.126 -0.093 -0.079 -0.055 -0.018 -0.067 -0.034
## 140
## -0.069
##
## $`21`
##
## Autocorrelations of series 'x', by lag
##
## 0 7 14 21 28 35 42 49 56 63
## 1.000 0.385 0.151 0.069 0.176 -0.200 -0.264 -0.190 -0.107 -0.132
## 70 77 84 91 98 105 112 119 126 133
## -0.136 -0.126 -0.072 -0.038 -0.026 -0.026 0.054 0.128 0.115 -0.002
## 140
## -0.048
##
## $`22`
##
## Autocorrelations of series 'x', by lag
##
## 0 7 14 21 28 35 42 49 56 63
## 1.000 0.420 0.270 0.121 0.136 -0.259 -0.160 -0.110 -0.093 -0.178
## 70 77 84 91 98 105 112 119 126 133
## -0.110 -0.111 -0.157 -0.114 -0.104 -0.065 -0.017 0.060 -0.035 -0.050
## 140
## -0.032
##
## $`23`
##
## Autocorrelations of series 'x', by lag
##
## 0 7 14 21 28 35 42 49 56 63
## 1.000 0.518 0.312 0.112 0.068 -0.221 -0.192 -0.184 -0.140 -0.152
## 70 77 84 91 98 105 112 119 126 133
## -0.133 -0.120 -0.136 -0.099 -0.081 -0.023 0.040 0.068 -0.019 -0.035
## 140
## -0.052
##
## $`24`
##
## Autocorrelations of series 'x', by lag
##
## 0 7 14 21 28 35 42 49 56 63
## 1.000 0.302 0.161 0.009 0.134 -0.181 -0.129 -0.091 -0.091 -0.092
## 70 77 84 91 98 105 112 119 126 133
## -0.067 -0.043 -0.167 -0.124 -0.083 0.000 -0.107 0.002 -0.013 -0.016
## 140
## 0.043
##
## $`25`
##
## Autocorrelations of series 'x', by lag
##
## 0 7 14 21 28 35 42 49 56 63
## 1.000 0.550 0.369 0.239 0.188 -0.138 -0.152 -0.148 -0.159 -0.199
## 70 77 84 91 98 105 112 119 126 133
## -0.178 -0.160 -0.170 -0.179 -0.136 -0.109 -0.063 -0.007 -0.041 -0.045
## 140
## -0.031
##
## $`26`
##
## Autocorrelations of series 'x', by lag
##
## 0 7 14 21 28 35 42 49 56 63
## 1.000 0.331 0.258 0.024 0.161 -0.173 -0.117 -0.091 -0.056 -0.049
## 70 77 84 91 98 105 112 119 126 133
## -0.073 -0.005 -0.099 -0.061 -0.093 -0.016 -0.041 0.012 -0.033 -0.009
## 140
## -0.008
##
## $`27`
##
## Autocorrelations of series 'x', by lag
##
## 0 7 14 21 28 35 42 49 56 63
## 1.000 0.311 0.223 0.086 0.154 -0.235 -0.126 -0.153 -0.118 -0.140
## 70 77 84 91 98 105 112 119 126 133
## -0.105 -0.068 -0.136 -0.094 -0.053 0.019 -0.033 0.042 -0.017 -0.035
## 140
## 0.026
##
## $`28`
##
## Autocorrelations of series 'x', by lag
##
## 0 7 14 21 28 35 42 49 56 63
## 1.000 0.102 0.101 0.024 0.338 -0.147 -0.127 -0.109 -0.059 0.073
## 70 77 84 91 98 105 112 119 126 133
## -0.062 -0.098 -0.034 0.152 -0.042 -0.136 0.007 0.064 0.028 -0.143
## 140
## -0.094
##
## $`29`
##
## Autocorrelations of series 'x', by lag
##
## 0 7 14 21 28 35 42 49 56 63
## 1.000 0.375 0.199 0.147 0.195 -0.234 -0.176 -0.087 -0.120 -0.172
## 70 77 84 91 98 105 112 119 126 133
## -0.108 -0.093 -0.125 -0.099 -0.117 -0.085 -0.026 0.007 -0.062 -0.084
## 140
## -0.030
##
## $`30`
##
## Autocorrelations of series 'x', by lag
##
## 0 7 14 21 28 35 42 49 56 63 70 77
## 1.000 0.409 0.447 0.486 0.524 0.231 0.316 0.464 0.210 0.262 0.267 0.272
## 84 91 98 105 112 119 126 133 140
## 0.130 0.225 0.194 0.171 0.161 0.160 0.142 0.115 0.106
##
## $`31`
##
## Autocorrelations of series 'x', by lag
##
## 0 7 14 21 28 35 42 49 56 63
## 1.000 0.267 0.179 0.144 0.366 -0.094 -0.051 0.025 0.007 -0.029
## 70 77 84 91 98 105 112 119 126 133
## -0.014 0.015 -0.026 0.012 0.015 0.002 0.030 0.051 0.054 -0.007
## 140
## -0.032
##
## $`32`
##
## Autocorrelations of series 'x', by lag
##
## 0 7 14 21 28 35 42 49 56 63
## 1.000 0.366 0.254 0.129 0.266 -0.131 -0.090 -0.084 -0.028 -0.090
## 70 77 84 91 98 105 112 119 126 133
## -0.063 -0.086 -0.079 -0.089 -0.074 -0.061 -0.005 0.044 0.039 0.031
## 140
## -0.008
##
## $`33`
##
## Autocorrelations of series 'x', by lag
##
## 0 7 14 21 28 35 42 49 56 63
## 1.000 0.520 0.217 0.318 0.773 0.585 0.180 0.157 0.493 0.619
## 70 77 84 91 98 105 112 119 126 133
## 0.197 0.030 0.183 0.500 0.192 -0.091 -0.081 0.261 0.202 -0.131
## 140
## -0.211
##
## $`34`
##
## Autocorrelations of series 'x', by lag
##
## 0 7 14 21 28 35 42 49 56 63
## 1.000 0.303 0.214 0.123 0.288 -0.102 -0.056 0.003 -0.016 -0.048
## 70 77 84 91 98 105 112 119 126 133
## -0.063 -0.033 -0.045 -0.089 -0.060 -0.022 -0.009 -0.048 -0.018 -0.023
## 140
## -0.051
##
## $`35`
##
## Autocorrelations of series 'x', by lag
##
## 0 7 14 21 28 35 42 49 56 63 70 77
## 1.000 0.471 0.371 0.307 0.419 0.084 0.080 0.097 0.103 0.075 0.135 0.196
## 84 91 98 105 112 119 126 133 140
## 0.146 0.129 0.156 0.196 0.179 0.163 0.163 0.155 0.158
##
## $`36`
##
## Autocorrelations of series 'x', by lag
##
## 0 7 14 21 28 35 42 49 56 63 70 77
## 1.000 0.854 0.778 0.771 0.844 0.766 0.679 0.652 0.707 0.707 0.625 0.548
## 84 91 98 105 112 119 126 133 140
## 0.575 0.602 0.523 0.441 0.427 0.466 0.423 0.344 0.304
##
## $`37`
##
## Autocorrelations of series 'x', by lag
##
## 0 7 14 21 28 35 42 49 56 63
## 1.000 -0.011 0.186 0.155 0.218 0.057 0.109 0.121 0.037 0.070
## 70 77 84 91 98 105 112 119 126 133
## 0.042 0.185 -0.073 0.018 -0.012 0.024 0.001 -0.035 -0.124 0.004
## 140
## -0.072
##
## $`38`
##
## Autocorrelations of series 'x', by lag
##
## 0 7 14 21 28 35 42 49 56 63 70 77
## 1.000 0.420 0.418 0.405 0.658 0.558 0.346 0.342 0.419 0.628 0.311 0.279
## 84 91 98 105 112 119 126 133 140
## 0.281 0.574 0.332 0.193 0.213 0.355 0.362 0.184 0.105
##
## $`39`
##
## Autocorrelations of series 'x', by lag
##
## 0 7 14 21 28 35 42 49 56 63
## 1.000 0.452 0.339 0.253 0.362 -0.014 -0.019 -0.023 -0.044 -0.073
## 70 77 84 91 98 105 112 119 126 133
## -0.040 -0.030 -0.044 -0.021 0.022 0.038 0.084 0.140 0.125 0.065
## 140
## 0.039
##
## $`40`
##
## Autocorrelations of series 'x', by lag
##
## 0 7 14 21 28 35 42 49 56 63
## 1.000 0.299 0.247 0.006 0.089 -0.221 -0.113 -0.095 -0.059 -0.031
## 70 77 84 91 98 105 112 119 126 133
## -0.048 -0.051 -0.138 -0.084 -0.124 -0.043 -0.055 0.022 -0.008 -0.003
## 140
## 0.048
##
## $`41`
##
## Autocorrelations of series 'x', by lag
##
## 0 7 14 21 28 35 42 49 56 63
## 1.000 0.371 0.321 0.182 0.318 -0.082 -0.007 -0.016 -0.032 -0.069
## 70 77 84 91 98 105 112 119 126 133
## -0.014 -0.016 -0.062 -0.066 0.010 0.029 0.053 0.076 0.115 0.064
## 140
## 0.073
##
## $`42`
##
## Autocorrelations of series 'x', by lag
##
## 0 7 14 21 28 35 42 49 56 63
## 1.000 0.134 -0.519 -0.170 0.720 0.444 -0.342 -0.393 0.338 0.687
## 70 77 84 91 98 105 112 119 126 133
## -0.093 -0.517 -0.016 0.690 0.168 -0.468 -0.261 0.476 0.373 -0.284
## 140
## -0.433
##
## $`43`
##
## Autocorrelations of series 'x', by lag
##
## 0 7 14 21 28 35 42 49 56 63
## 1.000 0.372 0.201 0.317 0.649 0.379 0.175 0.191 0.307 0.445
## 70 77 84 91 98 105 112 119 126 133
## 0.155 0.055 0.039 0.329 0.191 -0.052 -0.054 0.184 0.209 -0.104
## 140
## -0.063
##
## $`44`
##
## Autocorrelations of series 'x', by lag
##
## 0 7 14 21 28 35 42 49 56 63 70 77
## 1.000 0.106 0.301 0.275 0.398 0.210 0.214 0.233 0.242 0.101 0.129 0.221
## 84 91 98 105 112 119 126 133 140
## 0.169 0.143 0.088 0.138 0.190 0.114 0.019 0.133 0.163
##
## $`45`
##
## Autocorrelations of series 'x', by lag
##
## 0 7 14 21 28 35 42 49 56 63
## 1.000 0.378 0.234 0.103 0.160 -0.191 -0.113 -0.087 -0.083 -0.120
## 70 77 84 91 98 105 112 119 126 133
## -0.083 -0.098 -0.122 -0.114 -0.121 -0.094 -0.050 -0.040 -0.078 -0.055
## 140
## -0.032
lapply(train_xts, function(x) pacf(x))
## $`1`
##
## Partial autocorrelations of series 'x', by lag
##
## 7 14 21 28 35 42 49 56 63 70
## 0.299 0.082 0.014 0.177 -0.338 -0.005 0.072 -0.133 0.118 -0.040
## 77 84 91 98 105 112 119 126 133 140
## -0.091 0.084 -0.056 -0.015 0.043 -0.014 0.002 0.007 -0.081 0.002
##
## $`2`
##
## Partial autocorrelations of series 'x', by lag
##
## 7 14 21 28 35 42 49 56 63 70
## 0.393 0.123 -0.039 0.162 -0.326 -0.012 0.145 -0.142 0.045 -0.024
## 77 84 91 98 105 112 119 126 133 140
## -0.102 0.064 -0.101 -0.047 0.055 0.002 -0.012 -0.038 -0.102 -0.018
##
## $`3`
##
## Partial autocorrelations of series 'x', by lag
##
## 7 14 21 28 35 42 49 56 63 70
## 0.522 0.143 0.192 0.033 -0.361 0.017 0.117 -0.015 0.149 -0.119
## 77 84 91 98 105 112 119 126 133 140
## -0.047 -0.011 -0.013 -0.048 -0.093 0.020 0.004 -0.035 -0.024 -0.106
##
## $`4`
##
## Partial autocorrelations of series 'x', by lag
##
## 7 14 21 28 35 42 49 56 63 70
## 0.415 0.143 0.059 0.262 -0.270 0.018 0.113 -0.093 0.163 -0.050
## 77 84 91 98 105 112 119 126 133 140
## -0.040 0.055 -0.093 0.079 0.029 -0.046 0.085 -0.056 -0.010 -0.023
##
## $`5`
##
## Partial autocorrelations of series 'x', by lag
##
## 7 14 21 28 35 42 49 56 63 70
## 0.367 0.165 0.098 0.177 -0.328 -0.139 0.001 -0.010 0.175 -0.032
## 77 84 91 98 105 112 119 126 133 140
## -0.028 0.011 -0.134 -0.013 -0.018 0.053 0.011 -0.052 -0.115 -0.064
##
## $`6`
##
## Partial autocorrelations of series 'x', by lag
##
## 7 14 21 28 35 42 49 56 63 70
## 0.455 0.093 -0.025 0.116 -0.365 -0.031 -0.009 -0.171 0.046 -0.109
## 77 84 91 98 105 112 119 126 133 140
## -0.064 -0.001 -0.206 -0.034 0.003 -0.049 -0.007 -0.090 -0.120 0.010
##
## $`7`
##
## Partial autocorrelations of series 'x', by lag
##
## 7 14 21 28 35 42 49 56 63 70
## 0.638 0.181 -0.028 0.200 -0.282 -0.120 -0.029 -0.181 0.124 0.009
## 77 84 91 98 105 112 119 126 133 140
## 0.010 0.030 -0.079 -0.011 0.046 0.012 0.010 0.054 -0.006 0.111
##
## $`8`
##
## Partial autocorrelations of series 'x', by lag
##
## 7 14 21 28 35 42 49 56 63 70
## 0.367 0.134 0.005 0.132 -0.366 0.050 0.109 -0.029 0.109 -0.136
## 77 84 91 98 105 112 119 126 133 140
## -0.058 0.017 -0.068 0.056 0.008 0.044 -0.012 -0.136 -0.016 -0.094
##
## $`9`
##
## Partial autocorrelations of series 'x', by lag
##
## 7 14 21 28 35 42 49 56 63 70
## 0.428 0.139 0.046 0.153 -0.369 0.051 0.065 -0.012 0.084 -0.132
## 77 84 91 98 105 112 119 126 133 140
## 0.008 0.018 -0.097 0.053 -0.008 0.059 0.006 -0.103 -0.018 -0.025
##
## $`10`
##
## Partial autocorrelations of series 'x', by lag
##
## 7 14 21 28 35 42 49 56 63 70
## 0.456 0.129 -0.001 0.129 -0.355 0.032 0.067 -0.117 0.112 -0.099
## 77 84 91 98 105 112 119 126 133 140
## -0.044 0.018 -0.135 0.087 -0.007 -0.034 0.077 -0.048 -0.029 -0.108
##
## $`11`
##
## Partial autocorrelations of series 'x', by lag
##
## 7 14 21 28 35 42 49 56 63 70
## 0.407 0.068 -0.040 0.103 -0.377 0.088 0.113 -0.147 0.093 -0.122
## 77 84 91 98 105 112 119 126 133 140
## -0.070 0.063 -0.154 0.054 0.006 0.008 0.097 -0.019 -0.091 -0.061
##
## $`12`
##
## Partial autocorrelations of series 'x', by lag
##
## 7 14 21 28 35 42 49 56 63 70
## 0.364 0.127 -0.008 0.157 -0.381 0.015 0.029 -0.108 0.118 -0.081
## 77 84 91 98 105 112 119 126 133 140
## -0.061 0.102 -0.159 0.014 0.049 0.026 0.022 -0.120 -0.108 -0.080
##
## $`13`
##
## Partial autocorrelations of series 'x', by lag
##
## 7 14 21 28 35 42 49 56 63 70
## 0.399 0.122 -0.061 0.174 -0.354 0.033 0.063 -0.136 0.097 -0.081
## 77 84 91 98 105 112 119 126 133 140
## -0.095 -0.007 -0.114 0.029 -0.005 0.053 0.046 -0.056 -0.026 -0.038
##
## $`14`
##
## Partial autocorrelations of series 'x', by lag
##
## 7 14 21 28 35 42 49 56 63 70
## 0.264 0.206 -0.076 0.107 -0.320 -0.031 0.074 -0.069 0.027 -0.091
## 77 84 91 98 105 112 119 126 133 140
## -0.005 -0.129 0.017 -0.008 0.016 0.012 -0.025 -0.042 -0.055 0.048
##
## $`15`
##
## Partial autocorrelations of series 'x', by lag
##
## 7 14 21 28 35 42 49 56 63 70
## 0.408 0.081 -0.010 0.119 -0.402 -0.005 -0.003 -0.077 0.073 -0.161
## 77 84 91 98 105 112 119 126 133 140
## -0.064 -0.048 -0.142 0.034 -0.026 -0.042 -0.021 -0.191 -0.021 0.004
##
## $`16`
##
## Partial autocorrelations of series 'x', by lag
##
## 7 14 21 28 35 42 49 56 63 70
## 0.597 0.114 -0.058 0.162 -0.264 -0.008 0.000 -0.054 0.088 -0.160
## 77 84 91 98 105 112 119 126 133 140
## -0.061 0.007 -0.053 0.059 -0.013 -0.035 -0.114 -0.087 0.096 0.040
##
## $`17`
##
## Partial autocorrelations of series 'x', by lag
##
## 7 14 21 28 35 42 49 56 63 70
## 0.148 0.305 0.093 0.040 -0.238 0.071 0.075 0.021 -0.044 0.039
## 77 84 91 98 105 112 119 126 133 140
## 0.148 -0.058 -0.073 0.024 0.220 -0.250 -0.026 -0.047 -0.045 0.227
##
## $`18`
##
## Partial autocorrelations of series 'x', by lag
##
## 7 14 21 28 35 42 49 56 63 70
## 0.435 0.182 -0.049 0.108 -0.352 0.066 0.134 -0.081 0.051 -0.123
## 77 84 91 98 105 112 119 126 133 140
## -0.048 -0.097 0.041 -0.027 0.015 0.081 0.003 -0.066 -0.046 0.017
##
## $`19`
##
## Partial autocorrelations of series 'x', by lag
##
## 7 14 21 28 35 42 49 56 63 70
## 0.353 0.063 -0.098 0.098 -0.291 0.003 0.016 -0.092 0.011 -0.077
## 77 84 91 98 105 112 119 126 133 140
## 0.013 -0.060 -0.069 0.032 0.053 0.022 -0.043 -0.162 -0.035 0.040
##
## $`20`
##
## Partial autocorrelations of series 'x', by lag
##
## 7 14 21 28 35 42 49 56 63 70
## 0.396 0.145 -0.031 0.098 -0.309 0.075 0.053 -0.071 0.039 -0.073
## 77 84 91 98 105 112 119 126 133 140
## -0.031 -0.047 -0.094 0.011 -0.012 0.001 0.020 -0.132 0.023 -0.064
##
## $`21`
##
## Partial autocorrelations of series 'x', by lag
##
## 7 14 21 28 35 42 49 56 63 70
## 0.385 0.004 0.012 0.170 -0.392 -0.093 -0.004 -0.079 0.076 -0.108
## 77 84 91 98 105 112 119 126 133 140
## -0.151 -0.015 -0.053 -0.002 -0.013 0.003 0.064 -0.008 -0.122 -0.095
##
## $`22`
##
## Partial autocorrelations of series 'x', by lag
##
## 7 14 21 28 35 42 49 56 63 70
## 0.420 0.113 -0.033 0.086 -0.425 0.075 0.067 -0.082 -0.008 -0.157
## 77 84 91 98 105 112 119 126 133 140
## -0.061 -0.065 0.010 -0.082 -0.035 0.039 -0.012 -0.127 -0.101 -0.031
##
## $`23`
##
## Partial autocorrelations of series 'x', by lag
##
## 7 14 21 28 35 42 49 56 63 70
## 0.518 0.061 -0.099 0.041 -0.347 0.061 -0.005 -0.061 0.005 -0.152
## 77 84 91 98 105 112 119 126 133 140
## -0.030 -0.093 -0.008 -0.022 -0.020 0.065 -0.063 -0.144 -0.037 -0.063
##
## $`24`
##
## Partial autocorrelations of series 'x', by lag
##
## 7 14 21 28 35 42 49 56 63 70
## 0.302 0.077 -0.066 0.150 -0.283 -0.033 0.031 -0.126 0.051 -0.062
## 77 84 91 98 105 112 119 126 133 140
## -0.056 -0.142 -0.077 -0.009 0.011 -0.109 0.015 -0.072 -0.091 0.116
##
## $`25`
##
## Partial autocorrelations of series 'x', by lag
##
## 7 14 21 28 35 42 49 56 63 70
## 0.550 0.096 0.003 0.048 -0.394 0.047 0.019 -0.061 0.025 -0.149
## 77 84 91 98 105 112 119 126 133 140
## -0.045 -0.051 -0.091 0.030 -0.061 0.014 0.034 -0.188 -0.016 -0.046
##
## $`26`
##
## Partial autocorrelations of series 'x', by lag
##
## 7 14 21 28 35 42 49 56 63 70
## 0.331 0.167 -0.118 0.165 -0.289 -0.054 0.097 -0.104 0.088 -0.093
## 77 84 91 98 105 112 119 126 133 140
## -0.016 -0.068 -0.062 0.010 -0.014 0.011 0.003 -0.078 -0.009 0.000
##
## $`27`
##
## Partial autocorrelations of series 'x', by lag
##
## 7 14 21 28 35 42 49 56 63 70
## 0.311 0.139 -0.019 0.118 -0.359 -0.006 -0.031 -0.068 0.053 -0.139
## 77 84 91 98 105 112 119 126 133 140
## 0.004 -0.147 -0.057 0.035 -0.012 -0.011 -0.022 -0.111 -0.083 0.079
##
## $`28`
##
## Partial autocorrelations of series 'x', by lag
##
## 7 14 21 28 35 42 49 56 63 70
## 0.102 0.091 0.006 0.332 -0.241 -0.160 -0.054 -0.160 0.308 -0.032
## 77 84 91 98 105 112 119 126 133 140
## -0.150 0.048 -0.046 0.020 -0.058 -0.004 0.033 0.029 -0.105 -0.141
##
## $`29`
##
## Partial autocorrelations of series 'x', by lag
##
## 7 14 21 28 35 42 49 56 63 70
## 0.375 0.069 0.061 0.132 -0.428 0.011 0.042 -0.108 0.082 -0.142
## 77 84 91 98 105 112 119 126 133 140
## -0.097 -0.004 -0.082 -0.076 -0.010 -0.002 -0.014 -0.105 -0.151 -0.024
##
## $`30`
##
## Partial autocorrelations of series 'x', by lag
##
## 7 14 21 28 35 42 49 56 63 70
## 0.409 0.336 0.308 0.305 -0.219 -0.061 0.274 -0.130 0.082 -0.045
## 77 84 91 98 105 112 119 126 133 140
## -0.057 0.055 0.005 -0.033 0.099 -0.014 -0.028 0.002 -0.024 -0.038
##
## $`31`
##
## Partial autocorrelations of series 'x', by lag
##
## 7 14 21 28 35 42 49 56 63 70
## 0.267 0.116 0.077 0.324 -0.324 -0.046 0.068 -0.137 0.206 -0.048
## 77 84 91 98 105 112 119 126 133 140
## -0.066 0.070 -0.086 0.070 0.015 0.003 0.073 -0.018 -0.037 -0.071
##
## $`32`
##
## Partial autocorrelations of series 'x', by lag
##
## 7 14 21 28 35 42 49 56 63 70
## 0.366 0.139 -0.003 0.224 -0.363 -0.013 0.034 -0.061 0.111 -0.092
## 77 84 91 98 105 112 119 126 133 140
## -0.088 -0.011 -0.068 0.020 0.011 0.020 0.068 -0.032 -0.013 -0.073
##
## $`33`
##
## Partial autocorrelations of series 'x', by lag
##
## 7 14 21 28 35 42 49 56 63 70
## 0.520 -0.073 0.324 0.731 -0.062 -0.292 0.066 -0.137 0.273 -0.177
## 77 84 91 98 105 112 119 126 133 140
## -0.116 -0.175 0.038 -0.147 -0.069 -0.063 -0.073 0.069 0.015 0.063
##
## $`34`
##
## Partial autocorrelations of series 'x', by lag
##
## 7 14 21 28 35 42 49 56 63 70
## 0.303 0.134 0.029 0.247 -0.303 -0.031 0.085 -0.112 0.122 -0.088
## 77 84 91 98 105 112 119 126 133 140
## -0.066 0.063 -0.150 0.062 0.031 -0.057 0.050 -0.065 -0.035 -0.005
##
## $`35`
##
## Partial autocorrelations of series 'x', by lag
##
## 7 14 21 28 35 42 49 56 63 70
## 0.471 0.191 0.101 0.275 -0.321 -0.006 0.069 -0.047 0.197 0.060
## 77 84 91 98 105 112 119 126 133 140
## 0.075 0.000 -0.074 0.054 0.073 0.100 0.054 -0.013 -0.020 0.015
##
## $`36`
##
## Partial autocorrelations of series 'x', by lag
##
## 7 14 21 28 35 42 49 56 63 70
## 0.854 0.181 0.275 0.483 -0.258 -0.158 0.028 0.111 0.190 -0.118
## 77 84 91 98 105 112 119 126 133 140
## -0.241 0.045 0.046 -0.152 0.054 -0.092 -0.003 0.018 -0.078 0.027
##
## $`37`
##
## Partial autocorrelations of series 'x', by lag
##
## 7 14 21 28 35 42 49 56 63 70
## -0.011 0.186 0.165 0.204 0.023 0.020 0.055 -0.029 0.009 -0.008
## 77 84 91 98 105 112 119 126 133 140
## 0.152 -0.086 -0.078 -0.063 -0.017 0.041 -0.033 -0.153 0.008 -0.034
##
## $`38`
##
## Partial autocorrelations of series 'x', by lag
##
## 7 14 21 28 35 42 49 56 63 70
## 0.420 0.293 0.209 0.525 0.334 -0.091 -0.117 -0.148 0.286 -0.047
## 77 84 91 98 105 112 119 126 133 140
## -0.078 -0.109 0.210 -0.062 -0.144 -0.048 -0.025 -0.059 0.082 -0.073
##
## $`39`
##
## Partial autocorrelations of series 'x', by lag
##
## 7 14 21 28 35 42 49 56 63 70
## 0.452 0.169 0.064 0.249 -0.375 -0.019 0.029 -0.126 0.197 -0.049
## 77 84 91 98 105 112 119 126 133 140
## -0.044 0.053 -0.100 0.105 0.046 0.050 0.153 -0.092 -0.059 -0.044
##
## $`40`
##
## Partial autocorrelations of series 'x', by lag
##
## 7 14 21 28 35 42 49 56 63 70
## 0.299 0.173 -0.122 0.085 -0.274 -0.021 0.064 -0.073 0.070 -0.102
## 77 84 91 98 105 112 119 126 133 140
## -0.071 -0.097 -0.045 -0.035 -0.002 -0.023 -0.015 -0.038 -0.064 0.068
##
## $`41`
##
## Partial autocorrelations of series 'x', by lag
##
## 7 14 21 28 35 42 49 56 63 70
## 0.371 0.213 0.011 0.236 -0.349 0.001 0.073 -0.129 0.152 -0.046
## 77 84 91 98 105 112 119 126 133 140
## -0.050 0.016 -0.119 0.126 0.055 0.029 0.098 -0.047 -0.021 0.034
##
## $`42`
##
## Partial autocorrelations of series 'x', by lag
##
## 7 14 21 28 35 42 49 56 63 70
## 0.134 -0.547 0.015 0.681 0.291 0.070 -0.045 -0.192 0.356 -0.108
## 77 84 91 98 105 112 119 126 133 140
## -0.083 -0.058 0.033 -0.101 0.003 0.042 -0.015 -0.128 0.029 -0.075
##
## $`43`
##
## Partial autocorrelations of series 'x', by lag
##
## 7 14 21 28 35 42 49 56 63 70
## 0.372 0.073 0.257 0.566 0.072 -0.057 -0.116 -0.220 0.307 -0.063
## 77 84 91 98 105 112 119 126 133 140
## -0.081 -0.238 -0.020 0.170 -0.062 0.055 -0.008 -0.021 -0.077 0.061
##
## $`44`
##
## Partial autocorrelations of series 'x', by lag
##
## 7 14 21 28 35 42 49 56 63 70
## 0.106 0.293 0.245 0.340 0.109 -0.001 0.011 0.029 -0.094 -0.081
## 77 84 91 98 105 112 119 126 133 140
## 0.081 0.088 0.090 -0.025 -0.050 0.073 0.038 -0.129 -0.040 0.093
##
## $`45`
##
## Partial autocorrelations of series 'x', by lag
##
## 7 14 21 28 35 42 49 56 63 70
## 0.378 0.106 -0.019 0.125 -0.346 0.034 0.030 -0.087 0.049 -0.108
## 77 84 91 98 105 112 119 126 133 140
## -0.072 -0.038 -0.072 -0.051 -0.022 -0.006 -0.047 -0.092 -0.047 -0.034
#fitting auto arima model
train_ts_fit<-lapply(train_xts, function(x) auto.arima(x, lambda = 0))
#validate test set by MAE, MAPE, RMSE
fcast<-list()
sum_MAE<-0
sum_MAPE<-0
sum_RMSE<-0
for (i in c(colnames(train_xts))){
fcast[[i]]<- forecast(auto.arima(train_xts[,i]), h = 39)$mean
fcast.num<-as.numeric(fcast[[i]])
test.num<-as.numeric(test_xts[,i])
sum_MAE<-mean(abs(test.num-fcast.num))+sum_MAE
sum_MAPE<-100*mean(abs(test.num-fcast.num)/abs(test.num))+sum_MAPE
sum_RMSE<-sqrt(mean((test.num-fcast.num)^2))+sum_RMSE
}
#divide by 45 number of stores to get average model statistics
print(paste("arima model MAE is", sum_MAE/45))
## [1] "arima model MAE is 64933.6014515565"
print(paste("arima model MAPE is", sum_MAPE/45))
## [1] "arima model MAPE is 6.17079873690773"
print(paste("arima model RMSE is", sum_RMSE/45))
## [1] "arima model RMSE is 79707.2952189233"
Random Forst model using “ranger”
#build random forest model on training set
train_rf<-data_clean %>% filter(Date <"2012-02-03") %>%
group_by (Date, Store,IsHoliday, Week, Type ) %>% summarize (Store_Weekly_Sales = sum(Weekly_Sales))
test_rf<-data_clean %>% filter(Date >="2012-02-03") %>%
group_by (Date, Store,IsHoliday, Week, Type ) %>% summarize (Store_Weekly_Sales = sum(Weekly_Sales))
train_rf
## # A tibble: 4,680 x 6
## # Groups: Date, Store, IsHoliday, Week [4,680]
## Date Store IsHoliday Week Type Store_Weekly_Sales
## <date> <fct> <lgl> <dbl> <fct> <dbl>
## 1 2010-02-05 1 FALSE 5 A 1643691.
## 2 2010-02-05 2 FALSE 5 A 2136989.
## 3 2010-02-05 3 FALSE 5 B 461622.
## 4 2010-02-05 4 FALSE 5 A 2135144.
## 5 2010-02-05 5 FALSE 5 B 317173.
## 6 2010-02-05 6 FALSE 5 A 1652635.
## 7 2010-02-05 7 FALSE 5 B 496725.
## 8 2010-02-05 8 FALSE 5 A 1004137.
## 9 2010-02-05 9 FALSE 5 B 549506.
## 10 2010-02-05 10 FALSE 5 B 2193049.
## # … with 4,670 more rows
test_rf
## # A tibble: 1,755 x 6
## # Groups: Date, Store, IsHoliday, Week [1,755]
## Date Store IsHoliday Week Type Store_Weekly_Sales
## <date> <fct> <lgl> <dbl> <fct> <dbl>
## 1 2012-02-03 1 FALSE 5 A 1636340.
## 2 2012-02-03 2 FALSE 5 A 1935300.
## 3 2012-02-03 3 FALSE 5 B 424961.
## 4 2012-02-03 4 FALSE 5 A 2173374.
## 5 2012-02-03 5 FALSE 5 B 333948
## 6 2012-02-03 6 FALSE 5 A 1496306.
## 7 2012-02-03 7 FALSE 5 B 580453.
## 8 2012-02-03 8 FALSE 5 A 927611.
## 9 2012-02-03 9 FALSE 5 B 549968.
## 10 2012-02-03 10 FALSE 5 B 1867403.
## # … with 1,745 more rows
seed<-set.seed(10)
outcome<-"train_rf$Store_Weekly_Sales"
vars<-c("Store", "Week", "Type", "IsHoliday")
fmla<-paste(outcome, "~", paste(vars, collapse = "+"))
fit_rf<-ranger(fmla,train_rf, num.trees = 500, respect.unordered.factors = "order", seed = seed)
#predict on test set
test_rf$pred<-predict(fit_rf, test_rf)$predictions
#calculate prediction RMSE
test_rf %>% mutate(residual = Store_Weekly_Sales-pred) %>%
summarize(RMSE = (mean(residual^2))^0.5) %>%
magrittr::use_series(RMSE) %>% mean()
## [1] 68906.49
#plot prediction
ggplot(test_rf, aes(x= pred, y=Store_Weekly_Sales))+
geom_point()+
geom_line(aes(color = factor(Store)))+
geom_abline()
XGBoost Tree Model
#data prep using "vtreat"
#create one hot encoding plan
treatplan<-designTreatmentsZ(train_rf, vars, verbose = F)
#querry new varName
newvars<-treatplan %>%
magrittr::use_series(scoreFrame) %>%
filter(code %in% c("clean", "lev")) %>%
magrittr::use_series(varName)
#prepare train/test set
train_rf_treat<-prepare(treatplan, train_rf, varRestriction = newvars)
test_rf_treat<-prepare(treatplan, test_rf, varRestriction = newvars)
str(train_rf_treat)
## 'data.frame': 4680 obs. of 50 variables:
## $ Week : num 5 5 5 5 5 5 5 5 5 5 ...
## $ IsHoliday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_1 : num 1 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_10: num 0 0 0 0 0 0 0 0 0 1 ...
## $ Store_lev_x_11: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_12: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_13: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_14: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_15: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_16: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_17: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_18: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_19: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_2 : num 0 1 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_20: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_21: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_22: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_23: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_24: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_25: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_26: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_27: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_28: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_29: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_3 : num 0 0 1 0 0 0 0 0 0 0 ...
## $ Store_lev_x_30: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_31: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_32: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_33: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_34: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_35: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_36: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_37: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_38: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_39: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_4 : num 0 0 0 1 0 0 0 0 0 0 ...
## $ Store_lev_x_40: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_41: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_42: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_43: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_44: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_45: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_5 : num 0 0 0 0 1 0 0 0 0 0 ...
## $ Store_lev_x_6 : num 0 0 0 0 0 1 0 0 0 0 ...
## $ Store_lev_x_7 : num 0 0 0 0 0 0 1 0 0 0 ...
## $ Store_lev_x_8 : num 0 0 0 0 0 0 0 1 0 0 ...
## $ Store_lev_x_9 : num 0 0 0 0 0 0 0 0 1 0 ...
## $ Type_lev_x_A : num 1 1 0 1 0 1 0 1 0 0 ...
## $ Type_lev_x_B : num 0 0 1 0 1 0 1 0 1 1 ...
## $ Type_lev_x_C : num 0 0 0 0 0 0 0 0 0 0 ...
str(test_rf_treat)
## 'data.frame': 1755 obs. of 50 variables:
## $ Week : num 5 5 5 5 5 5 5 5 5 5 ...
## $ IsHoliday : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_1 : num 1 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_10: num 0 0 0 0 0 0 0 0 0 1 ...
## $ Store_lev_x_11: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_12: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_13: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_14: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_15: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_16: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_17: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_18: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_19: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_2 : num 0 1 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_20: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_21: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_22: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_23: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_24: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_25: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_26: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_27: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_28: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_29: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_3 : num 0 0 1 0 0 0 0 0 0 0 ...
## $ Store_lev_x_30: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_31: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_32: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_33: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_34: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_35: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_36: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_37: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_38: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_39: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_4 : num 0 0 0 1 0 0 0 0 0 0 ...
## $ Store_lev_x_40: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_41: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_42: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_43: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_44: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_45: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Store_lev_x_5 : num 0 0 0 0 1 0 0 0 0 0 ...
## $ Store_lev_x_6 : num 0 0 0 0 0 1 0 0 0 0 ...
## $ Store_lev_x_7 : num 0 0 0 0 0 0 1 0 0 0 ...
## $ Store_lev_x_8 : num 0 0 0 0 0 0 0 1 0 0 ...
## $ Store_lev_x_9 : num 0 0 0 0 0 0 0 0 1 0 ...
## $ Type_lev_x_A : num 1 1 0 1 0 1 0 1 0 0 ...
## $ Type_lev_x_B : num 0 0 1 0 1 0 1 0 1 1 ...
## $ Type_lev_x_C : num 0 0 0 0 0 0 0 0 0 0 ...
#xgboost cross validation on training set
cv<-xgb.cv(data=as.matrix(train_rf_treat), label = train_rf$Store_Weekly_Sales,
nrounds = 100, nfold = 5, max_depth = 6, eta = 0.3,
objective = "reg:linear", early_stopping_rounds = 10)
## [04:13:36] WARNING: amalgamation/../src/objective/regression_obj.cu:170: reg:linear is now deprecated in favor of reg:squarederror.
## [04:13:36] WARNING: amalgamation/../src/objective/regression_obj.cu:170: reg:linear is now deprecated in favor of reg:squarederror.
## [04:13:37] WARNING: amalgamation/../src/objective/regression_obj.cu:170: reg:linear is now deprecated in favor of reg:squarederror.
## [04:13:37] WARNING: amalgamation/../src/objective/regression_obj.cu:170: reg:linear is now deprecated in favor of reg:squarederror.
## [04:13:37] WARNING: amalgamation/../src/objective/regression_obj.cu:170: reg:linear is now deprecated in favor of reg:squarederror.
## [1] train-rmse:862282.400000+1628.260279 test-rmse:862886.512500+8568.557350
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 10 rounds.
##
## [2] train-rmse:632992.587500+918.194840 test-rmse:633014.975000+6262.622753
## [3] train-rmse:475023.656250+1634.537846 test-rmse:475896.987500+4848.590787
## [4] train-rmse:366171.556250+1973.696671 test-rmse:367492.712500+3941.187009
## [5] train-rmse:291985.431250+2006.600549 test-rmse:296671.537500+3891.661203
## [6] train-rmse:242186.887500+2610.504242 test-rmse:247743.387500+6401.768745
## [7] train-rmse:208834.465625+3724.237255 test-rmse:214529.840625+6333.118730
## [8] train-rmse:185239.771875+2840.337732 test-rmse:192620.159375+5671.530014
## [9] train-rmse:169638.159375+3006.191626 test-rmse:178997.725000+6223.592634
## [10] train-rmse:157930.203125+2826.673997 test-rmse:168065.103125+6916.199015
## [11] train-rmse:149411.940625+1979.544857 test-rmse:159659.150000+6411.843533
## [12] train-rmse:141879.209375+2311.359854 test-rmse:152916.521875+6711.671487
## [13] train-rmse:134846.609375+1691.227277 test-rmse:146419.187500+6322.391126
## [14] train-rmse:129363.843750+1449.398559 test-rmse:140907.059375+5654.048145
## [15] train-rmse:123906.337500+934.256830 test-rmse:136369.140625+5528.793028
## [16] train-rmse:119780.826563+645.267979 test-rmse:132698.037500+5612.884757
## [17] train-rmse:116054.971875+830.468588 test-rmse:128652.629687+5836.304871
## [18] train-rmse:112775.440625+662.476747 test-rmse:125427.339063+5838.021699
## [19] train-rmse:109878.657812+996.205461 test-rmse:122789.784375+5738.262154
## [20] train-rmse:107163.754688+848.287801 test-rmse:120468.225000+5780.027733
## [21] train-rmse:104150.137500+875.559993 test-rmse:118221.356250+6097.211212
## [22] train-rmse:101359.656250+787.792734 test-rmse:115823.964063+5341.858740
## [23] train-rmse:99104.551563+912.913593 test-rmse:113674.670313+5226.754461
## [24] train-rmse:96867.920313+773.240069 test-rmse:112006.064063+5528.126017
## [25] train-rmse:94962.770313+836.687034 test-rmse:110209.670312+5482.852506
## [26] train-rmse:93242.723437+994.484044 test-rmse:108607.159375+5859.720974
## [27] train-rmse:91380.498437+1036.864386 test-rmse:107303.053125+5895.066634
## [28] train-rmse:89772.745312+725.712309 test-rmse:106014.596875+5975.810766
## [29] train-rmse:88329.606250+720.044322 test-rmse:104906.617187+6094.042997
## [30] train-rmse:87115.409375+1016.453762 test-rmse:103886.865625+5994.954717
## [31] train-rmse:85713.960938+934.971476 test-rmse:102918.553125+6304.505331
## [32] train-rmse:84374.720312+1021.587053 test-rmse:101995.021875+6413.165394
## [33] train-rmse:83339.801563+1116.223802 test-rmse:101350.792187+6304.864848
## [34] train-rmse:82492.592188+1129.354596 test-rmse:100690.175000+6378.735428
## [35] train-rmse:80791.412500+915.196162 test-rmse:99462.182813+6026.812450
## [36] train-rmse:79991.050000+998.655983 test-rmse:98938.531250+6088.102305
## [37] train-rmse:78671.965625+787.658270 test-rmse:97947.073438+6426.951933
## [38] train-rmse:77597.051562+499.147713 test-rmse:97447.595312+6503.626432
## [39] train-rmse:76868.248438+499.911856 test-rmse:97011.096875+6401.480049
## [40] train-rmse:76109.035937+247.855549 test-rmse:96359.103125+6744.507526
## [41] train-rmse:74961.470312+610.293253 test-rmse:95597.348438+6526.024821
## [42] train-rmse:74220.445313+625.010027 test-rmse:95290.378125+6628.321287
## [43] train-rmse:73527.096875+475.618166 test-rmse:94544.235937+6772.329539
## [44] train-rmse:72963.145313+681.199956 test-rmse:94134.421875+6800.539546
## [45] train-rmse:72208.664062+438.033154 test-rmse:93705.509375+7064.347556
## [46] train-rmse:71733.825000+492.477312 test-rmse:93357.445312+7073.152285
## [47] train-rmse:71016.917188+493.306064 test-rmse:92856.670313+7078.902913
## [48] train-rmse:70403.109375+450.133871 test-rmse:92495.037500+7348.722239
## [49] train-rmse:69985.170312+338.479199 test-rmse:92229.340625+7317.593437
## [50] train-rmse:69598.381250+492.044858 test-rmse:91947.001563+7434.438578
## [51] train-rmse:69101.707812+555.659864 test-rmse:91609.800000+7589.462651
## [52] train-rmse:68611.431250+777.814649 test-rmse:91399.239063+7601.262501
## [53] train-rmse:68094.806250+696.462149 test-rmse:91284.023438+7869.567901
## [54] train-rmse:67616.496875+701.551712 test-rmse:91076.304687+7922.174673
## [55] train-rmse:66831.207813+663.859158 test-rmse:90502.215625+7639.559211
## [56] train-rmse:66001.510938+416.502047 test-rmse:90157.378125+7476.217296
## [57] train-rmse:65639.016406+390.494078 test-rmse:90075.981250+7521.955946
## [58] train-rmse:65239.530469+525.946369 test-rmse:89927.685938+7505.828376
## [59] train-rmse:64905.942187+490.294556 test-rmse:89753.918750+7530.250128
## [60] train-rmse:64474.571875+577.513364 test-rmse:89771.654687+7562.762379
## [61] train-rmse:64041.322656+556.548944 test-rmse:89654.629688+7464.357590
## [62] train-rmse:63484.001562+783.000719 test-rmse:89407.540625+7433.533112
## [63] train-rmse:63083.953906+799.119386 test-rmse:89437.628125+7557.980095
## [64] train-rmse:62659.934375+720.194651 test-rmse:89347.151563+7512.773605
## [65] train-rmse:62279.526562+592.661522 test-rmse:89231.550000+7556.660948
## [66] train-rmse:61895.600781+542.158787 test-rmse:89002.201562+7543.509163
## [67] train-rmse:61664.750781+565.681365 test-rmse:89049.200000+7750.741594
## [68] train-rmse:61303.084375+727.927708 test-rmse:88899.898438+7856.735575
## [69] train-rmse:60860.826562+751.551689 test-rmse:88738.845313+7865.001934
## [70] train-rmse:60660.183594+750.422254 test-rmse:88615.209375+7892.156439
## [71] train-rmse:60385.171094+672.984342 test-rmse:88448.976563+7841.022052
## [72] train-rmse:60175.201563+722.654870 test-rmse:88480.340625+8000.503806
## [73] train-rmse:59772.202344+706.413314 test-rmse:88301.101562+8068.716341
## [74] train-rmse:59336.779687+802.249481 test-rmse:88203.181250+8082.382744
## [75] train-rmse:58985.623437+743.829010 test-rmse:88099.779688+8125.924915
## [76] train-rmse:58716.616406+829.660181 test-rmse:87996.409375+8017.077586
## [77] train-rmse:58457.553906+915.102583 test-rmse:88022.450000+8109.123650
## [78] train-rmse:58312.691406+896.885692 test-rmse:87945.446875+8075.271748
## [79] train-rmse:58054.892969+900.874118 test-rmse:87985.978125+8041.083585
## [80] train-rmse:57839.200000+873.792958 test-rmse:88031.834375+8228.601237
## [81] train-rmse:57603.865625+869.487147 test-rmse:87996.456250+8272.252042
## [82] train-rmse:57361.356250+908.993452 test-rmse:87910.346875+8193.713051
## [83] train-rmse:57140.927344+906.441383 test-rmse:87855.734375+8188.471435
## [84] train-rmse:56886.545312+814.019150 test-rmse:87838.896875+8262.783076
## [85] train-rmse:56634.233594+769.985160 test-rmse:87920.740625+8174.171314
## [86] train-rmse:56402.882813+851.411260 test-rmse:87849.432813+8172.347963
## [87] train-rmse:56252.852344+822.823719 test-rmse:87863.317187+8183.240970
## [88] train-rmse:56144.444531+813.858644 test-rmse:87816.032813+8171.278465
## [89] train-rmse:55891.027344+868.410532 test-rmse:87819.957813+8213.104948
## [90] train-rmse:55724.408594+849.963138 test-rmse:87841.645313+8256.783414
## [91] train-rmse:55516.001562+851.560080 test-rmse:87824.965625+8286.632695
## [92] train-rmse:55325.757031+809.629745 test-rmse:87862.545312+8246.908108
## [93] train-rmse:55130.828906+751.593831 test-rmse:87844.503125+8187.714687
## [94] train-rmse:54942.117969+825.741431 test-rmse:87921.334375+8210.278438
## [95] train-rmse:54768.475781+851.472640 test-rmse:87934.998438+8236.115728
## [96] train-rmse:54542.211719+863.584589 test-rmse:87980.435937+8254.714900
## [97] train-rmse:54434.634375+903.299824 test-rmse:87989.790625+8326.303684
## [98] train-rmse:54271.241406+866.737832 test-rmse:87955.439063+8301.000391
## Stopping. Best iteration:
## [88] train-rmse:56144.444531+813.858644 test-rmse:87816.032813+8171.278465
#evaluation
elog<- cv$evaluation_log %>%
summarize(ntrees.train = which.min(train_rmse_mean), ntrees.test = which.min(test_rmse_mean))
elog
## ntrees.train ntrees.test
## 1 98 88
nrounds <-88
#fit xgboost model
fit_xgb<-xgboost(data=as.matrix(train_rf_treat), label = train_rf$Store_Weekly_Sales,
nrounds = nrounds, max_depth = 6, eta = 0.3,
objective = "reg:linear")
## [04:13:39] WARNING: amalgamation/../src/objective/regression_obj.cu:170: reg:linear is now deprecated in favor of reg:squarederror.
## [1] train-rmse:861916.750000
## [2] train-rmse:632046.375000
## [3] train-rmse:473051.156250
## [4] train-rmse:363791.750000
## [5] train-rmse:289901.718750
## [6] train-rmse:239562.546875
## [7] train-rmse:203265.859375
## [8] train-rmse:181006.921875
## [9] train-rmse:165810.312500
## [10] train-rmse:154943.578125
## [11] train-rmse:147005.062500
## [12] train-rmse:139474.859375
## [13] train-rmse:134075.046875
## [14] train-rmse:128767.507812
## [15] train-rmse:124468.382812
## [16] train-rmse:120405.921875
## [17] train-rmse:115109.187500
## [18] train-rmse:112099.578125
## [19] train-rmse:108541.164062
## [20] train-rmse:106140.250000
## [21] train-rmse:103635.875000
## [22] train-rmse:100809.062500
## [23] train-rmse:98928.773438
## [24] train-rmse:97486.078125
## [25] train-rmse:96023.820312
## [26] train-rmse:93736.617188
## [27] train-rmse:92661.703125
## [28] train-rmse:90555.023438
## [29] train-rmse:88532.289062
## [30] train-rmse:85469.906250
## [31] train-rmse:84673.898438
## [32] train-rmse:83004.625000
## [33] train-rmse:81786.062500
## [34] train-rmse:80850.554688
## [35] train-rmse:79487.125000
## [36] train-rmse:78600.679688
## [37] train-rmse:78028.843750
## [38] train-rmse:76704.085938
## [39] train-rmse:76265.445312
## [40] train-rmse:75721.867188
## [41] train-rmse:75021.039062
## [42] train-rmse:74170.117188
## [43] train-rmse:73394.734375
## [44] train-rmse:71236.289062
## [45] train-rmse:70933.601562
## [46] train-rmse:70584.343750
## [47] train-rmse:69906.820312
## [48] train-rmse:69293.210938
## [49] train-rmse:68653.718750
## [50] train-rmse:68219.328125
## [51] train-rmse:67500.781250
## [52] train-rmse:66990.390625
## [53] train-rmse:66561.617188
## [54] train-rmse:66274.070312
## [55] train-rmse:65882.273438
## [56] train-rmse:65703.921875
## [57] train-rmse:65574.531250
## [58] train-rmse:64736.808594
## [59] train-rmse:64405.402344
## [60] train-rmse:64006.894531
## [61] train-rmse:63691.015625
## [62] train-rmse:63551.902344
## [63] train-rmse:63418.898438
## [64] train-rmse:63176.003906
## [65] train-rmse:62886.625000
## [66] train-rmse:62282.394531
## [67] train-rmse:62118.941406
## [68] train-rmse:61525.636719
## [69] train-rmse:61219.394531
## [70] train-rmse:60788.832031
## [71] train-rmse:60684.953125
## [72] train-rmse:60328.257812
## [73] train-rmse:59824.453125
## [74] train-rmse:59625.078125
## [75] train-rmse:59376.449219
## [76] train-rmse:59187.148438
## [77] train-rmse:59049.410156
## [78] train-rmse:58781.226562
## [79] train-rmse:58602.574219
## [80] train-rmse:58440.226562
## [81] train-rmse:58253.304688
## [82] train-rmse:58061.941406
## [83] train-rmse:57922.503906
## [84] train-rmse:57780.234375
## [85] train-rmse:57683.617188
## [86] train-rmse:57511.320312
## [87] train-rmse:57419.421875
## [88] train-rmse:57107.171875
#predict test set
test_rf$xgbPred<-predict(fit_xgb, as.matrix(test_rf_treat))
str(test_rf)
## Classes 'grouped_df', 'tbl_df', 'tbl' and 'data.frame': 1755 obs. of 8 variables:
## $ Date : Date, format: "2012-02-03" "2012-02-03" ...
## $ Store : Factor w/ 45 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ IsHoliday : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Week : num 5 5 5 5 5 5 5 5 5 5 ...
## $ Type : Factor w/ 3 levels "A","B","C": 1 1 2 1 2 1 2 1 2 2 ...
## $ Store_Weekly_Sales: num 1636340 1935300 424961 2173374 333948 ...
## $ pred : num 1559771 1932379 423340 2042503 342949 ...
## $ xgbPred : num 1607558 2015386 442634 2104802 333639 ...
## - attr(*, "groups")=Classes 'tbl_df', 'tbl' and 'data.frame': 1755 obs. of 5 variables:
## ..$ Date : Date, format: "2012-02-03" ...
## ..$ Store : Factor w/ 45 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
## ..$ IsHoliday: logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## ..$ Week : num 5 5 5 5 5 5 5 5 5 5 ...
## ..$ .rows :List of 1755
## .. ..$ : int 1
## .. ..$ : int 2
## .. ..$ : int 3
## .. ..$ : int 4
## .. ..$ : int 5
## .. ..$ : int 6
## .. ..$ : int 7
## .. ..$ : int 8
## .. ..$ : int 9
## .. ..$ : int 10
## .. ..$ : int 11
## .. ..$ : int 12
## .. ..$ : int 13
## .. ..$ : int 14
## .. ..$ : int 15
## .. ..$ : int 16
## .. ..$ : int 17
## .. ..$ : int 18
## .. ..$ : int 19
## .. ..$ : int 20
## .. ..$ : int 21
## .. ..$ : int 22
## .. ..$ : int 23
## .. ..$ : int 24
## .. ..$ : int 25
## .. ..$ : int 26
## .. ..$ : int 27
## .. ..$ : int 28
## .. ..$ : int 29
## .. ..$ : int 30
## .. ..$ : int 31
## .. ..$ : int 32
## .. ..$ : int 33
## .. ..$ : int 34
## .. ..$ : int 35
## .. ..$ : int 36
## .. ..$ : int 37
## .. ..$ : int 38
## .. ..$ : int 39
## .. ..$ : int 40
## .. ..$ : int 41
## .. ..$ : int 42
## .. ..$ : int 43
## .. ..$ : int 44
## .. ..$ : int 45
## .. ..$ : int 46
## .. ..$ : int 47
## .. ..$ : int 48
## .. ..$ : int 49
## .. ..$ : int 50
## .. ..$ : int 51
## .. ..$ : int 52
## .. ..$ : int 53
## .. ..$ : int 54
## .. ..$ : int 55
## .. ..$ : int 56
## .. ..$ : int 57
## .. ..$ : int 58
## .. ..$ : int 59
## .. ..$ : int 60
## .. ..$ : int 61
## .. ..$ : int 62
## .. ..$ : int 63
## .. ..$ : int 64
## .. ..$ : int 65
## .. ..$ : int 66
## .. ..$ : int 67
## .. ..$ : int 68
## .. ..$ : int 69
## .. ..$ : int 70
## .. ..$ : int 71
## .. ..$ : int 72
## .. ..$ : int 73
## .. ..$ : int 74
## .. ..$ : int 75
## .. ..$ : int 76
## .. ..$ : int 77
## .. ..$ : int 78
## .. ..$ : int 79
## .. ..$ : int 80
## .. ..$ : int 81
## .. ..$ : int 82
## .. ..$ : int 83
## .. ..$ : int 84
## .. ..$ : int 85
## .. ..$ : int 86
## .. ..$ : int 87
## .. ..$ : int 88
## .. ..$ : int 89
## .. ..$ : int 90
## .. ..$ : int 91
## .. ..$ : int 92
## .. ..$ : int 93
## .. ..$ : int 94
## .. ..$ : int 95
## .. ..$ : int 96
## .. ..$ : int 97
## .. ..$ : int 98
## .. ..$ : int 99
## .. .. [list output truncated]
## ..- attr(*, ".drop")= logi TRUE
#plot prediction vs. test set
ggplot(test_rf, aes(x= xgbPred, y=Store_Weekly_Sales))+
geom_point()+
geom_line(aes(color = factor(Store)))+
geom_abline()
#calculate model RMSE
test_rf %>% mutate (xgbResidual = Store_Weekly_Sales-xgbPred) %>%
group_by(Store) %>%
summarize (xgbRMSE_store = (mean(xgbResidual^2))^0.5) %>%
summarize(xgbRMSE = mean(xgbRMSE_store))
## # A tibble: 1 x 1
## xgbRMSE
## <dbl>
## 1 79746.
Build SVM model
#prepare data